#!/usr/bin/env bash
# Launch DeepSeek-R1 (AWQ-quantized) with vLLM: tp=16, chunked prefill,
# 16K context, listening on 0.0.0.0:8000.
# Alternative configurations are kept below as commented-out references.
set -euo pipefail

# Very generous timeouts (10h engine iteration / RPC) so long generations
# under heavy load don't trip vLLM's watchdogs.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000

#VLLM_PP_LAYER_PARTITION="31,30" vllm serve /mnt/nvme1n1/deepseek-r1-awq --served_model_name "deepseek-r1" -tp 16 -pp 1 --max-model-len $[16*1024]  --max_num_seqs 4  --gpu_memory_utilization 0.92 --quantization awq_marlin --disable_log_stats --trust-remote-code

# NOTE: $((...)) replaces the deprecated $[...] arithmetic form.
vllm serve /mnt/nvme1n1/deepseek-r1-awq \
  --served_model_name "deepseek-r1" \
  -tp 16 \
  -pp 1 \
  --max-model-len $((16 * 1024)) \
  --max_num_batched_tokens $((8 * 1024)) \
  --max_num_seqs 32 \
  --enable-chunked-prefill=True \
  --gpu_memory_utilization 0.92 \
  --quantization awq_marlin \
  --disable_log_stats \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 8000

#vllm serve /mnt/nvme1n1/deepseek-r1-awq --served_model_name "deepseek-r1" -tp 16 -pp 1 --max-model-len $[80*1024] --max_num_batched_tokens $[2*1024] --max_num_seqs 32 --enable-chunked-prefill=True --gpu_memory_utilization 0.94 --quantization awq_marlin --disable_log_stats --trust-remote-code
#VLLM_PP_LAYER_PARTITION="16,15,15,15" vllm serve /share/models/DeepSeek-R1-awq --served-model-name deepseek-r1 -tp 4 -pp 4 --max-model-len 8192 --gpu-memory-utilization 0.92 --dtype float16 --quantization awq_marlin --disable_log_stats --trust_remote_code

# --enable-prefix-caching=True